This notebook aims at locally training a neural network for sentiment analysis, before deployment on Azure.
We'll compare several text-normalization methods (stemming, lemmatization) and word-embedding methods (Word2vec, GloVe, FastText).
import itertools
import os
import re
import string
import sys

import gensim.downloader
import gensim.models
import keras_tuner as kt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import spacy
import tensorflow as tf
import tensorflow.keras as keras
from nltk.corpus import stopwords
from nltk.stem import snowball
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
tqdm.pandas()
# Load spacy model for lemmatization.
# Download the large English spaCy model if it is not installed yet
# (IPython shell magic — runs `python -m spacy download en_core_web_lg`
# in the notebook kernel's interpreter; requires `sys` to be imported).
spacy_model = 'en_core_web_lg'
if spacy_model not in spacy.util.get_installed_models():
    !{sys.executable} -m spacy download {spacy_model}
# nltk.download('stopwords')
# enabling plots export to html
import plotly
plotly.offline.init_notebook_mode()
%%time
# Load the Sentiment140 CSV (no header row); keep only the tweet text
# (column 5) and the polarity label (column 0), renamed 'text' / 'sentiment'.
tweets = (pd.read_csv('training.1600000.processed.noemoticon.csv', header=None)
          [[5, 0]]
          .set_axis(['text', 'sentiment'], axis=1))
CPU times: user 2.11 s, sys: 145 ms, total: 2.26 s Wall time: 2.36 s
# Display the loaded dataframe (1,600,000 rows x 2 columns per the output below).
tweets
| text | sentiment | |
|---|---|---|
| 0 | @switchfoot http://twitpic.com/2y1zl - Awww, t... | 0 |
| 1 | is upset that he can't update his Facebook by ... | 0 |
| 2 | @Kenichan I dived many times for the ball. Man... | 0 |
| 3 | my whole body feels itchy and like its on fire | 0 |
| 4 | @nationwideclass no, it's not behaving at all.... | 0 |
| ... | ... | ... |
| 1599995 | Just woke up. Having no school is the best fee... | 4 |
| 1599996 | TheWDB.com - Very cool to hear old Walt interv... | 4 |
| 1599997 | Are you ready for your MoJo Makeover? Ask me f... | 4 |
| 1599998 | Happy 38th Birthday to my boo of alll time!!! ... | 4 |
| 1599999 | happy #charitytuesday @theNSPCC @SparksCharity... | 4 |
1600000 rows × 2 columns
# Work on a stratified 10k-tweet subsample to keep local training fast;
# the unused remainder is discarded.
sample, _ = train_test_split(
    tweets,
    train_size=10_000,
    stratify=tweets['sentiment'],
    random_state=42,
)
sample
| text | sentiment | |
|---|---|---|
| 500415 | still sitting under the dryer, my neck hurts | 0 |
| 1577236 | @sarahshah this is my nightmare (even tho i on... | 4 |
| 178111 | @mjvarela black is good... tight, or should I ... | 0 |
| 396033 | Takes forever for everybody to get ready. | 0 |
| 31962 | @Bklyncookie omg all the LA bad weather aura i... | 0 |
| ... | ... | ... |
| 1282270 | @ChristinaNewman pounds are SOO over rated! I ... | 4 |
| 436582 | I'm going to miss dancing this summer. | 0 |
| 552624 | I need a hug...and less cynicism. It's making ... | 0 |
| 443309 | ugh... i have an upset stomach...ugh ... i no ... | 0 |
| 1270172 | At noodleword ! With a couple faggets. Hahah jk | 4 |
10000 rows × 2 columns
# 80% train, then split the remaining 20% evenly into validation and test,
# keeping the sentiment class balance at each step.
train_set, val_test_set = train_test_split(
    sample, train_size=0.8, stratify=sample['sentiment'], random_state=42)
val_set, test_set = train_test_split(
    val_test_set, train_size=0.5, stratify=val_test_set['sentiment'], random_state=42)
del val_test_set
for split_name, split_df in (('train', train_set),
                             ('validation', val_set),
                             ('test', test_set)):
    print(f'{split_name} set shape:', split_df.shape)
train set shape: (8000, 2) validation set shape: (1000, 2) test set shape: (1000, 2)
class DataPreprocessor:
    '''
    Preprocess text according to a normalization method (lemmatization,
    stemming or keep original form), then embed it with a pretrained
    gensim model.
    Converts the sentiment column into 0 (happy tweet) / 1 (unhappy tweet).
    '''
    def __init__(self, normalization='lem', embedding="word2vec"):
        if normalization.lower() not in ['lem', 'stem', 'keep']:
            raise ValueError('Invalid normalization method. Valid values are'\
                ' "lem" (Spacy lemmatization), "stem" (nltk stemming)'\
                ' and "keep" (no transformation).')
        # Bug fix: store lowercased — validation was case-insensitive but the
        # later comparisons ('stem'/'lem') were case-sensitive, so e.g. 'LEM'
        # silently skipped normalization entirely.
        self.normalization = normalization.lower()
        if self.normalization == 'stem':
            self.stemmer = snowball.EnglishStemmer()
        elif self.normalization == 'lem':
            self.nlp = spacy.load(spacy_model)
        self.stop_words = stopwords.words('english')
        # Gensim pretrained-model name for each supported embedding method.
        self.vec_methods = {'word2vec': 'word2vec-google-news-300',
                            'fasttext': 'fasttext-wiki-news-subwords-300',
                            'glove': 'glove-twitter-200'}
        if embedding.lower() not in self.vec_methods:
            raise ValueError('Invalid embedding method. Valid values are', ', '.join(self.vec_methods.keys()))
        # Same case-normalization fix as above: a mixed-case value would have
        # made the vec_methods lookup in _get_pretrained_vectors return None.
        self.embedding = embedding.lower()
        self.vectors = self._get_pretrained_vectors()

    def _get_pretrained_vectors(self):
        '''
        Download (if needed) and return the keyed vectors of the pretrained
        gensim model matching self.embedding.
        '''
        model_name = self.vec_methods.get(self.embedding)
        print(f'Loading vectors for {self.embedding} model, please wait...')
        vectors = gensim.downloader.load(model_name)
        print('Vectors loaded.')
        return vectors

    def _normalize_text(self, input_string):
        '''
        Return input_string after deleting twitter user names, punctuation,
        digits and multiple spaces, removing stop words, then stemming or
        lemmatizing according to self.normalization (only the 'lem' path
        lowercases, via tok.lemma_.lower()).
        Returns np.nan when nothing is left after cleaning.
        '''
        result = input_string
        # Raw strings: '\@' / '\d' are invalid escape sequences in plain
        # strings and raise warnings on Python >= 3.12.
        twitter_pattern = r"\@\S*"
        punct_pattern = '[' + re.escape(string.punctuation) + r'\d]'
        result = re.sub(twitter_pattern, '', result)
        result = re.sub(punct_pattern, ' ', result)
        result = re.sub('[ ]{2,}', ' ', result)
        if self.normalization == 'keep':
            result = ' '.join([word for word in result.split() if word not in self.stop_words])
        elif self.normalization == 'stem':
            result = ' '.join([self.stemmer.stem(word) for word in result.split() if word not in self.stop_words])
        elif self.normalization == 'lem':
            result = ' '.join([tok.lemma_.lower() for tok in self.nlp(result) if tok.text not in self.stop_words])
        # Bug fix: np.NaN was removed in NumPy 2.0 — use np.nan.
        return result.strip() or np.nan

    def _embed_sentence(self, sentence):
        '''
        Return the average vector of all in-vocabulary words in the sentence;
        a zero vector if no word is known to the embedding.
        '''
        sentence_vec = np.zeros((self.vectors.vector_size,))
        known_words = [word for word in sentence.split() if word in self.vectors.key_to_index]
        if known_words:
            sentence_vec = np.mean([self.vectors[word] for word in known_words], axis=0)
        return sentence_vec

    def _embed_dataset(self, dataframe, sentiment_col='sentiment', text_col='text'):
        '''
        For a dataframe with text and sentiment columns, return a dataframe
        with the same rows: first column = sentiment, remaining columns =
        the embedding vector of the text.
        '''
        # Bug fix: removed the dead `elif self.embedding == 'basic'` branch —
        # 'basic' can never pass __init__ validation and the branch called a
        # nonexistent self._basic_embedder.
        vec_df = pd.DataFrame(dataframe[text_col].apply(
            self._embed_sentence).tolist(), index=dataframe.index)
        result = pd.concat((pd.DataFrame(dataframe[sentiment_col]), vec_df), axis=1)
        return result

    def preprocess_dataset(self, dataframe, sentiment_col='sentiment', text_col='text', pos_label=0):
        '''
        Return tuple:
        - Dataframe of text embedding vectors
        - Dataframe (single column) for the sentiment feature after label
          conversion: pos_label --> 1, anything else --> 0 (with the default
          pos_label=0, raw labels 0 --> 1 and 4 --> 0).
        Rows whose text is empty after normalization are dropped.
        '''
        result = dataframe.copy()
        result[sentiment_col] = (result[sentiment_col] == pos_label).astype(int)
        result[text_col] = result[text_col].apply(self._normalize_text)
        result = result.dropna()
        result = self._embed_dataset(result, sentiment_col=sentiment_col, text_col=text_col)
        return result.iloc[:, 1:], result.iloc[:, :1]
#########################################
############    TESTS      ##############
#########################################
# Smoke tests: a digits-only input normalizes to NaN, and a sentence
# embedding has the expected dimensionality (300 for word2vec-google-news-300).
dp = DataPreprocessor()
assert np.isnan(dp._normalize_text('123'))
assert dp._embed_sentence("cats like dogs but dogs don't like cats").shape == (300,)
print('All tests passed.')
Loading vectors for word2vec model, please wait... Vectors loaded. All tests passed.
Since for us the positive case is the case of negative/unhappy sentiment, we turn the "sentiment" column into expected values:
Text must be cleaned before embedding. We'll remove:
Then we'll apply stemming or lemmatization to enhance the model performance. We'll compare the performance of both methods through the model results. Here is an example of each preprocessing method:
# Demonstrate each normalization method on the same example tweet.
# Note: each DataPreprocessor instantiation reloads the embedding vectors.
test_string = "@mimi2000 We, finally!: went to the shopping) 12centers! 34"
print('Test string:')
print(test_string)
print('\nPreprocessed string with lemmatization:')
print(DataPreprocessor(normalization='lem')._normalize_text(test_string))
print('\nPreprocessed string with stemming:')
print(DataPreprocessor(normalization='stem')._normalize_text(test_string))
# Typo fix in the label below: 'lemmaization' -> 'lemmatization'.
print('\nPreprocessed string with no stemming/lemmatization:')
print(DataPreprocessor(normalization='keep')._normalize_text(test_string))
Test string:
@mimi2000 We, finally!: went to the shopping) 12centers! 34
Preprocessed string with lemmatization:
Loading vectors for word2vec model, please wait...
Vectors loaded.
we finally go shopping center
Preprocessed string with stemming:
Loading vectors for word2vec model, please wait...
Vectors loaded.
we final went shop center
Preprocessed string with no stemming/lemmatization:
Loading vectors for word2vec model, please wait...
Vectors loaded.
We finally went shopping centers
For our first try, we'll use pre-trained Word2vec English model from Gensim.
# Check which embedding method the preprocessor is configured with.
dp.embedding
'word2vec'
# Sanity-check the embedding: nearest neighbours of 'cat' in vector space.
dp.vectors.similar_by_word('cat')
[('cats', 0.8099379539489746),
('dog', 0.7609456181526184),
('kitten', 0.7464985251426697),
('feline', 0.7326233983039856),
('beagle', 0.7150582671165466),
('puppy', 0.7075453400611877),
('pup', 0.6934289932250977),
('pet', 0.6891530752182007),
('felines', 0.6755931377410889),
('chihuahua', 0.6709762811660767)]
# Same sanity check for 'dog'.
dp.vectors.similar_by_word('dog')
[('dogs', 0.8680490851402283),
('puppy', 0.8106428384780884),
('pit_bull', 0.780396044254303),
('pooch', 0.7627375721931458),
('cat', 0.7609457969665527),
('golden_retriever', 0.7500901222229004),
('German_shepherd', 0.7465173006057739),
('Rottweiler', 0.7437615990638733),
('beagle', 0.7418619990348816),
('pup', 0.7406911253929138)]
To embed whole sentences, we'll average the vectors of each word.
Our function is ready to preprocess each dataset:
# Vectorize each split with the current preprocessor (lemmatization + word2vec).
print('Preprocessing train set...')
X_train, y_train = dp.preprocess_dataset(train_set)
print('Train set preprocessed.')
print('Preprocessing val set...')
X_val, y_val = dp.preprocess_dataset(val_set)
print('Val set preprocessed.')
print('Preprocessing test set...')
X_test, y_test = dp.preprocess_dataset(test_set)
print('Test set preprocessed.')
Preprocessing train set... Train set preprocessed. Preprocessing val set... Val set preprocessed. Preprocessing test set... Test set preprocessed.
Now that we have cleaned the data, we can create the model:
# Confirm the embedding method in use before building the model.
dp.embedding
'word2vec'
# Baseline model: logistic regression on the averaged word vectors.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
lr = LogisticRegression()
lr.fit(X_train, np.ravel(y_train))  # ravel: sklearn expects a 1-D target
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# beta=2 weighs recall twice as much as precision in the F-score.
precision, recall, f2score, _ = precision_recall_fscore_support(y_test, y_pred, beta=2, average='binary')
print(f'Accuracy: {accuracy:.2%}\nPrecision: {precision:.2%}\nRecall: {recall:.2%}\nF2-score:{f2score:.2%}')
Accuracy: 71.92% Precision: 72.24% Recall: 71.08% F2-score:71.31%
# Create model
def build_model(activation='tanh'):
    '''
    Build and compile a dense binary classifier over the embedding features.

    Parameters: activation — hidden-layer activation ('tanh', 'relu' or
    'selu'); the matching weight initializer is picked automatically
    (unlisted activations fall back to None, i.e. the layer default).
    Returns: a compiled keras Sequential model (binary cross-entropy loss,
    SGD optimizer, accuracy/precision/recall metrics).
    '''
    nb_hidden_layers = 3
    nb_units = 128
    dropout_rate = 0.1
    learning_rate = 0.01
    # Pair each activation with its recommended initializer.
    initializers = {'tanh': keras.initializers.glorot_normal,
                    'relu': keras.initializers.he_normal,
                    'selu': keras.initializers.lecun_normal}
    initializer = initializers.get(activation)
    model = keras.models.Sequential()
    # Bug fix: shape must be a tuple — `(X_train.shape[1])` is just an int.
    model.add(keras.layers.Input(shape=(X_train.shape[1],)))
    for _ in range(nb_hidden_layers):
        model.add(keras.layers.Dense(nb_units, activation=activation,
                                     kernel_initializer=initializer))
        model.add(keras.layers.Dropout(dropout_rate))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss="binary_crossentropy",
                  optimizer=keras.optimizers.SGD(learning_rate=learning_rate),
                  metrics=['accuracy',
                           keras.metrics.Precision(name='precision'),
                           keras.metrics.Recall(name='recall')])
    return model
def plot_history(history, height=500):
    '''Plot training/validation loss, accuracy, precision and recall curves.'''
    metrics = ['loss', 'accuracy', 'precision', 'recall']
    palette = dict(zip(metrics, ["red", "green", "blue", "goldenrod"]))
    history_frame = pd.DataFrame(history.history)
    x_vals = list(history_frame.index)
    fig = go.Figure()
    for metric in metrics:
        color = palette[metric]
        # Solid line for the training metric, dotted line for validation.
        fig.add_trace(go.Scatter(x=x_vals, y=history_frame[metric],
                                 legendgroup=metric,
                                 legendgrouptitle_text=metric,
                                 name="train " + metric,
                                 mode='lines',
                                 line={'color': color},
                                 hoverinfo='y'))
        fig.add_trace(go.Scatter(x=x_vals, y=history_frame['val_' + metric],
                                 legendgroup=metric,
                                 name="validation " + metric,
                                 mode='lines',
                                 line={'color': color, 'dash': 'dot'},
                                 hoverinfo='y'))
    fig.update_layout(width=height * 1.5, height=height,
                      title="Training results (click on legend items to hide lines)")
    fig.update_xaxes(title_text='epochs')
    fig.update_yaxes(range=[0, 1], title_text='value')
    fig.show()
# Create custom function for Tensorboards logfiles
# All TensorBoard run logs live under ./my_logs.
root_logdir = os.path.join(os.curdir, "my_logs")

def get_run_logdir():
    '''Return a unique run directory path named with the current timestamp.'''
    import time
    stamp = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    return os.path.join(root_logdir, stamp)
# Each training run logs to its own timestamped TensorBoard directory.
run_logdir = get_run_logdir()
tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)
# Fit model
model = build_model()
# Early stopping halts training after 5 epochs without val_loss improvement.
history = model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val),
                    verbose=0, callbacks=[tensorboard_cb,
                    keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)])
plot_history(history)
The recall oscillates a lot. Maybe tuning the batch size will help? Let's train the model with different batch sizes, then, for each series of resulting val_recall values, compute its standard deviation:
# Train with batch sizes 32..512 (step 32) and record, for each, the spread
# (standard deviation) of the validation recall across epochs.
val_recall_std_for_batch_size = dict()
for batch_size in tqdm(range(32, 513, 32)):
    model = build_model()
    # NOTE(review): tensorboard_cb is reused across all runs, so every run
    # logs to the same directory — confirm this is intended.
    history = model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val),
                        batch_size=batch_size, verbose=0, callbacks=[tensorboard_cb,
                        keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)])
    val_recall_std_for_batch_size.update({batch_size: np.std(history.history['val_recall'])})
val_std_df = pd.DataFrame.from_dict(val_recall_std_for_batch_size, orient='index').reset_index()
# Plotly treats string x-values as categories, keeping the bars evenly spaced.
val_std_df['index'] = val_std_df['index'].astype(str)
px.bar(val_std_df, x='index', y=0,
       title='Standard deviation of val_recall according to batch size', labels={'index': 'batch_size', '0': 'val_recall standard deviation'})
Batch size does not seem to be significant to reduce val_recall oscillations. Maybe another activation function may help?
# Retrain with alternative activation functions and compare the curves.
for activation_function in ['relu', 'selu']:
    print(f'With {activation_function} activation')
    model = build_model(activation_function)
    history = model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val),
                        verbose=0,
                        callbacks=[tensorboard_cb,
                                   keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)])
    plot_history(history)
With relu activation
With selu activation
We notice that the model converges much faster with the SELU activation function, but it still oscillates a lot.
We used lemmatization and Word2vec embedding. Let's compare with other normalizing and embedding methods.
# Re-sample: a bigger stratified sample of 80k tweets for the comparison runs.
sample, _ = train_test_split(tweets, train_size=80_000,
                             stratify=tweets['sentiment'], random_state=42)
# 80% train / 10% validation / 10% test, stratified on sentiment at each step.
train_set, val_test_set = train_test_split(
    sample, train_size=0.8, stratify=sample['sentiment'], random_state=42)
val_set, test_set = train_test_split(
    val_test_set, train_size=0.5, stratify=val_test_set['sentiment'], random_state=42)
del val_test_set
# Compare every normalization x embedding combination on the bigger sample.
result_columns = ['normalization', 'embedding', 'accuracy', 'precision', 'recall', 'f2 score']
preproc_params = ['stem', 'lem']
vecto_params = ['word2vec', 'glove', 'fasttext']
params = list(itertools.product(preproc_params, vecto_params))
result_rows = []
for normalization, embedding in tqdm(params):
    # preprocess datasets
    print(f'Preprocessing datasets with {normalization} and {embedding}')
    dp = DataPreprocessor(normalization=normalization, embedding=embedding)
    X_train, y_train = dp.preprocess_dataset(train_set)
    X_val, y_val = dp.preprocess_dataset(val_set)
    X_test, y_test = dp.preprocess_dataset(test_set)
    # Create new model (SELU converged fastest in the previous experiment)
    model = build_model('selu')
    # Fit model
    history = model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val),
                        callbacks=[tensorboard_cb,
                                   keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)],
                        verbose=0)
    print('Model fitted')
    # save results
    _, accuracy, precision, recall = model.evaluate(X_test, y_test, verbose=0)
    # F-beta with beta=2: weighs recall twice as heavily as precision.
    f2_score = 5 * precision * recall / ((4 * precision) + recall)
    result_rows.append(dict(zip(result_columns, [normalization,
                                                 embedding,
                                                 accuracy,
                                                 precision,
                                                 recall,
                                                 f2_score])))
# Bug fix: DataFrame.append was removed in pandas 2.0 — collect plain dicts
# during the loop and build the dataframe once at the end.
results = pd.DataFrame(result_rows, columns=result_columns)
Preprocessing datasets with stem and word2vec Loading vectors for word2vec model, please wait... Vectors loaded. Model fitted Preprocessing datasets with stem and glove Loading vectors for glove model, please wait... Vectors loaded. Model fitted Preprocessing datasets with stem and fasttext Loading vectors for fasttext model, please wait... Vectors loaded. Model fitted Preprocessing datasets with lem and word2vec Loading vectors for word2vec model, please wait... Vectors loaded. Model fitted Preprocessing datasets with lem and glove Loading vectors for glove model, please wait... Vectors loaded. Model fitted Preprocessing datasets with lem and fasttext Loading vectors for fasttext model, please wait... Vectors loaded. Model fitted
# Display the comparison results sorted by f2 score, metric columns as percentages.
results.sort_values(by='f2 score').style.format(dict.fromkeys(results.columns[2:], '{:.2%}'))
| normalization | embedding | accuracy | precision | recall | f2 score | |
|---|---|---|---|---|---|---|
| 1 | stem | glove | 73.44% | 72.14% | 76.41% | 75.52% |
| 3 | lem | word2vec | 74.44% | 73.51% | 76.46% | 75.85% |
| 0 | stem | word2vec | 72.13% | 69.98% | 77.59% | 75.94% |
| 2 | stem | fasttext | 72.18% | 69.82% | 78.19% | 76.36% |
| 5 | lem | fasttext | 74.63% | 73.35% | 77.41% | 76.56% |
| 4 | lem | glove | 75.13% | 73.86% | 77.84% | 77.01% |
Lemmatization with glove embedding seems to be the best combination, we'll use it for next steps. But there is not a huge difference between all combinations.
# Keep the best combination found above: lemmatization + GloVe embedding.
dp = DataPreprocessor('lem', 'glove')
Loading vectors for glove model, please wait... Vectors loaded.
# Preprocess the final splits with the chosen preprocessor.
X_train, y_train = dp.preprocess_dataset(train_set)
X_val, y_val = dp.preprocess_dataset(val_set)
X_test, y_test = dp.preprocess_dataset(test_set)
# Persist each split to CSV, named after its variable (e.g. X_train.csv).
for dataset in [X_train, X_val, X_test, y_train, y_val, y_test]:
    name = [x for x in globals() if globals()[x] is dataset][0]
    filename = name + ".csv"
    # Bug fix: previously wrote to `name` (no extension), leaving `filename` unused.
    dataset.to_csv(filename)
Since the recall is rather wobbly, we won't monitor it for tuning hyperparameters, but val_loss instead.
def build_model(hp):
    '''
    Keras-Tuner hypermodel: build and compile a dense binary classifier whose
    architecture and training settings are sampled from `hp`.

    Search space: number of hidden layers (1-6), units per layer (8-128),
    activation (with matching initializer), optional dropout and its rate,
    learning rate (log scale) and optimizer.
    Returns a compiled keras Sequential model.
    '''
    nb_hidden_layers = hp.Choice('nb_hidden_layers', values=[1, 2, 3, 4, 5, 6])
    nb_units = hp.Choice('nb_units', values=[8, 16, 32, 64, 128])
    # Pair each activation with its recommended initializer.
    initializers = {'tanh': keras.initializers.glorot_normal,
                    'relu': keras.initializers.he_normal,
                    'selu': keras.initializers.lecun_normal}
    activation = hp.Choice('activation', values=list(initializers.keys()))
    dropout = hp.Boolean('dropout')
    dropout_rate = hp.Float("dropout_rate", min_value=0.1, max_value=0.5)
    learning_rate = hp.Float("learning_rate", min_value=0.0001, max_value=0.1, sampling='log')
    optimizer_name = hp.Choice('optimizer', values=['adam', 'sgd', 'rmsprop'])
    # Bug fix: the sampled learning_rate was never applied when the optimizer
    # was passed to compile() as a bare string — instantiate it explicitly.
    optimizer_classes = {'adam': keras.optimizers.Adam,
                         'sgd': keras.optimizers.SGD,
                         'rmsprop': keras.optimizers.RMSprop}
    optimizer = optimizer_classes[optimizer_name](learning_rate=learning_rate)
    model = keras.models.Sequential()
    # Bug fix: Input shape must be a tuple — `(X_train.shape[1])` is an int.
    model.add(keras.layers.Input(shape=(X_train.shape[1],)))
    for _ in range(nb_hidden_layers):
        model.add(keras.layers.Dense(nb_units, activation=activation,
                                     kernel_initializer=initializers[activation]))
        if dropout:
            model.add(keras.layers.Dropout(dropout_rate))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss="binary_crossentropy",
                  optimizer=optimizer,
                  metrics=['accuracy',
                           keras.metrics.Precision(name='precision'),
                           keras.metrics.Recall(name='recall')])
    return model
# Bayesian hyperparameter search over the space defined in build_model,
# minimizing the validation loss across up to 50 trials.
tuner = kt.BayesianOptimization(hypermodel=build_model,
                                objective=kt.Objective("val_loss", direction='min'),
                                max_trials=50,
                                overwrite=True,
                                directory='my_dir',
                                project_name='essai')
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
tuner.search(X_train, y_train, validation_data=(X_val, y_val), callbacks=[stop_early],)
Trial 50 Complete [00h 00m 02s] val_loss: 0.5070626139640808 Best val_loss So Far: 0.503059446811676 Total elapsed time: 00h 02m 52s INFO:tensorflow:Oracle triggered exit
Now that the tuner has found good parameters, we can use them in our model:
# Report the best hyperparameters found, then retrain a model with them.
print('Best parameters:')
for key, value in tuner.get_best_hyperparameters()[0].values.items():
    print(key, ':', value)
model = build_model(tuner.get_best_hyperparameters()[0])
history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)],
                    epochs=50,
                    verbose=0)
Best parameters: nb_hidden_layers : 1 nb_units : 32 activation : relu dropout : False dropout_rate : 0.1 learning_rate : 0.0001 optimizer : adam
plot_history(history)
# Summarize how much the validation recall fluctuates over training.
val_recall = history.history['val_recall']
min_recall = min(val_recall)
max_recall = max(val_recall)
avg_recall = np.mean(val_recall)
print(f"Recall is comprised between {min_recall:.2%} and {max_recall:.2%}, with an average of {avg_recall:.2%}.")
Recall is comprised between 71.41% and 79.54%, with an average of 75.96%.
This is a little better than our baseline recall on a simple logistic regression (72%).